import datetime
import os
import pandas as pd
import openpyxl
from bs4 import BeautifulSoup
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from gitee_config import OUTPUT_DIRS_DEFAULT
from spider import spider


def __get_github_repos_in_class(classify, url):
    """Fetch up to 10 pages of GitHub search results for one classification.

    Args:
        classify: classification label, carried through the spider tasks as
            the flag so responses can be attributed.
        url: base search-API URL; ``&page=N`` is appended for pages 1-10.

    Returns:
        List of repo tuples (name, full_name, language, stars, size, forks,
        issues, html_url, updated_at, created_at, description), each extended
        with a language-weight list by get_language_weight().
    """
    flag = [classify]
    # One task per result page; the search endpoint is paged 1..10 here.
    tasks = [(url + '&page=' + str(page), flag) for page in range(1, 11)]
    result = spider(10, tasks, 1)
    repos = []
    for flag, response in result:
        if response.status_code != 200:
            print("ERR:Failed to connect gitee repos")
            continue
        # Only the 'items' list of the search payload matters; missing or
        # empty payloads simply contribute no repos.
        for repo_item in response.json().get('items', []):
            repo = (
                repo_item['name'],
                repo_item['full_name'],
                repo_item['language'],
                repo_item['stargazers_count'],
                repo_item['size'],
                repo_item['forks_count'],
                repo_item['open_issues'],
                repo_item['html_url'],
                repo_item['updated_at'],
                repo_item['created_at'],
                repo_item['description'],
            )
            repos.append(repo)
    repos = get_language_weight(repos)
    print(repos)
    return repos

def get_language_weight(repos):
    """Scrape each repo's GitHub page for its language-percentage breakdown.

    Args:
        repos: list of repo tuples where index 1 is the full name (used as
            the spider task flag) and index 7 is the repo's html_url.

    Returns:
        New list of the same tuples, each extended with a 5-element list
        ``[C, C++, Python, JavaScript, Others]`` of percentage strings
        (0 where the language is absent). Repos whose page could not be
        fetched are dropped.
    """
    # Slot in the weight list for each language we track explicitly.
    language_slot = {'C': 0, 'C++': 1, 'Python': 2, 'JavaScript': 3}
    tasks = [(repo[7], repo[1]) for repo in repos]
    result = spider(1000, tasks, 1)
    new_repos = []
    for flag, response in result:
        if not (response and response.status_code == 200):
            continue
        language_weight = [0, 0, 0, 0, 0]
        soup = BeautifulSoup(response.text, "html.parser")
        # Walk down GitHub's page layout to the sidebar language bar.
        main_div = soup.find('div', class_='application-main')
        div_2rd = main_div.find('div', class_='repository-content', id='repo-content-pjax-container')
        div_3rd = div_2rd.find('div', class_='Layout-sidebar')
        for div in div_3rd.find_all('div', class_='BorderGrid-cell'):
            try:
                ul = div.find('ul', class_='list-style-none')
                for li in ul.find_all('li', class_='d-inline'):
                    span_list = li.find_all('span')
                    slot = language_slot.get(span_list[0].text)
                    if slot is not None:
                        # span[0] is the language name, span[1] its "NN.N%".
                        language_weight[slot] = span_list[1].text
            except (AttributeError, IndexError):
                # Not every BorderGrid-cell holds the languages list; skip it.
                continue
        # Sum the tracked languages and attribute the remainder to "Others".
        weight_num = sum(
            float(weight.rstrip('%'))
            for weight in language_weight
            if weight != 0
        )
        language_weight[4] = str(round(100 - weight_num, 1)) + '%'
        print(language_weight)
        # Re-attach the weights to the matching repo tuple by full name.
        for repo in repos:
            if flag == repo[1]:
                new_repos.append(repo + (language_weight,))
    return new_repos

def get_github_repos_all(classify_list, save_file=None):
    """Crawl every classification and write all repos into one xlsx workbook.

    Args:
        classify_list: iterable of (classify, url) pairs; each pair gets its
            own worksheet (``/`` in the name is replaced by ``&``, which
            Excel forbids in sheet names).
        save_file: output path; defaults to
            ``OUTPUT_DIRS_DEFAULT/<Y-M-D>/repos_in_github.xlsx`` (the dated
            directory is created if needed).
    """
    if not save_file:
        time_now = datetime.datetime.now()
        date_time = "%s-%s-%s" % (time_now.year, time_now.month, time_now.day)
        if not os.path.exists(OUTPUT_DIRS_DEFAULT):
            os.mkdir(OUTPUT_DIRS_DEFAULT)
        work_dir = os.path.join(OUTPUT_DIRS_DEFAULT, date_time)
        if not os.path.exists(work_dir):
            os.mkdir(work_dir)
        save_file = os.path.join(work_dir, "repos_in_github.xlsx")
    headers = ("repo name", "owner name", "language_weight", "stars", "size",
               "forks", "issues", "url", "update_time", "create_time",
               "description")
    # Leading spaces (except on the first label) reproduce the original
    # space-separated "C:x C++:y ..." format; the join is .strip()-ed below.
    language_labels = ('C:', ' C++:', ' Python:', ' JavaScript:', ' Others:')
    wb = openpyxl.Workbook()
    wb.remove(wb.active)  # drop the default empty sheet
    for classify, url in classify_list:
        sheet = wb.create_sheet(classify.replace('/', '&'))
        repos = __get_github_repos_in_class(classify, url)
        for col, header in enumerate(headers, start=1):
            sheet.cell(1, col, header)
        row = 2
        for repo in repos:
            # repo[11] is the [C, C++, Python, JS, Others] weight list;
            # 0 marks an absent language and is omitted from the summary.
            language_str = ''.join(
                label + weight
                for label, weight in zip(language_labels, repo[11])
                if weight != 0
            ).strip()
            sheet.cell(row, 1, repo[0])
            sheet.cell(row, 2, repo[1])
            sheet.cell(row, 3, language_str)
            # Columns 4-10 map directly onto repo[3]..repo[9].
            for col in range(4, 11):
                sheet.cell(row, col, repo[col - 1])
            # Excel rejects some control characters; strip them from the
            # free-text description before writing.
            sheet.cell(row, 11).value = ILLEGAL_CHARACTERS_RE.sub(r'', str(repo[10]))
            row += 1
    wb.save(save_file)
    sort_excel(save_file)
    return

def sort_excel(file, save_file_dir=None):
    """Merge all sheets of a workbook, dedupe by owner, and sort by stars.

    Args:
        file: path of the workbook produced by get_github_repos_all().
        save_file_dir: output file path; defaults to
            ``OUTPUT_DIRS_DEFAULT/<Y-M-D>/repos_in_github_afterMerge.xlsx``
            (the dated directory is created if needed).
    """
    if not save_file_dir:
        time_now = datetime.datetime.now()
        date_time = "%s-%s-%s" % (time_now.year, time_now.month, time_now.day)
        if not os.path.exists(OUTPUT_DIRS_DEFAULT):
            os.mkdir(OUTPUT_DIRS_DEFAULT)
        work_dir = os.path.join(OUTPUT_DIRS_DEFAULT, date_time)
        if not os.path.exists(work_dir):
            os.mkdir(work_dir)
        save_file_dir = os.path.join(work_dir, "repos_in_github_afterMerge.xlsx")
    # sheet_name=None reads every sheet at once into a dict of DataFrames.
    sheets = pd.read_excel(file, sheet_name=None)
    merged = pd.concat(sheets.values(), ignore_index=True)
    # Keep the first row seen per owner, then rank by star count descending.
    merged = merged.drop_duplicates('owner name')
    merged = merged.sort_values(by='stars', ascending=False)
    # The context manager saves and closes the writer; ExcelWriter.save()
    # was removed in modern pandas, so it must not be called explicitly.
    with pd.ExcelWriter(save_file_dir) as writer:
        merged.to_excel(writer, index=False)


if __name__ == "__main__":
    # Re-sort today's crawl output: ensure the dated working directory
    # exists, then merge and sort the workbook stored inside it.
    now = datetime.datetime.now()
    stamp = "%s-%s-%s" % (now.year, now.month, now.day)
    dated_dir = os.path.join(OUTPUT_DIRS_DEFAULT, stamp)
    for directory in (OUTPUT_DIRS_DEFAULT, dated_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)
    sort_excel(os.path.join(dated_dir, "repos_in_github.xlsx"))

